import pandas as pd
import glob
import os
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.utils import resample
from sklearn.metrics import roc_auc_score,precision_recall_curve,roc_curve
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.model_selection import learning_curve
from sklearn.model_selection import KFold
from sklearn.pipeline import FeatureUnion
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import mean_squared_log_error
from sklearn.metrics import accuracy_score, precision_score, recall_score # accuracy, recall, and precision metrics
from sklearn.metrics import classification_report # classification report
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import confusion_matrix # describe classifier performance
from tqdm import tqdm_notebook
import warnings
warnings.filterwarnings("ignore")
def files_df(sequence_list):
    """Load every sequence file into a one-column DataFrame, one row per file.

    Each file is read with ``pd.read_csv`` (no header, ``'EXIT\\n'`` as the
    delimiter) and the values of its first column are joined with single
    spaces into one string.

    Args:
        sequence_list: Iterable of paths to the sequence files.

    Returns:
        A DataFrame with a single column (labelled ``0``) holding one joined
        string per input file, in input order. Empty input yields an empty
        DataFrame.
    """
    # Collect the rows first and build the frame once: DataFrame.append was
    # removed in pandas 2.0 and copied the whole frame on every iteration.
    joined_rows = []
    for sequence_file in sequence_list:
        temp_df = pd.read_csv(sequence_file, delimiter='EXIT\n', header=None)
        joined_rows.append(' '.join(temp_df[0].values))
    return pd.DataFrame(joined_rows)
# glob.glob() returns paths in arbitrary, OS-dependent order, but the file
# number below is assigned purely by position. Sort the paths so the
# position -> file mapping is deterministic on every platform.
# NOTE(review): assumes the .seq file names sort in file-number order
# (e.g. zero-padded names) — confirm against the LOGS directory.
sequence_list = sorted(glob.glob("D:/UGM/DTETI FT UGM/!!MatKul!!/Semester 5/Pemrosesan Bahasa Alami/CDMC2019MiniTask/LOGS/LOGS/*.seq"))
df_fullver = files_df(sequence_list)
df_fullver.columns = ['Commands']
# 1-based running number of each sequence file, placed as the first column.
df_fullver.insert(0, 'seq_file_num', range(1, 1 + len(df_fullver)))
df_fullver.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 4167 entries, 0 to 4166 Data columns (total 2 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 seq_file_num 4167 non-null int32 1 Commands 4167 non-null object dtypes: int32(1), object(1) memory usage: 49.0+ KB
# Zero-pad the running file number to four digits so it can be used as a
# string key.
df_fullver['seq_file_num'] = [f"{file_num:04d}" for file_num in df_fullver['seq_file_num']]
df_fullver.head()
| seq_file_num | Commands | |
|---|---|---|
| 0 | 0001 | execve ioctl ioctl prctl gettimeofday getpid g... |
| 1 | 0002 | execve ioctl ioctl time getpid time getpid soc... |
| 2 | 0003 | execve ioctl ioctl prctl time getpid time getp... |
| 3 | 0004 | execve ioctl ioctl time getpid time getpid soc... |
| 4 | 0005 | execve ioctl ioctl prctl time getpid time getp... |
# Read the header-less training CSV; the first column is turned into the
# zero-padded four-digit file id, the second is kept as malware_types.
df_train = pd.read_csv("D:/UGM/DTETI FT UGM/!!MatKul!!/Semester 5/Pemrosesan Bahasa Alami/CDMC2019MiniTask/Train.csv", sep=',', header=None)
df_train.columns = ['Commands', 'malware_types']
df_train['Commands'] = [f"{file_id:04d}" for file_id in df_train['Commands']]
df_train.head()
| Commands | malware_types | |
|---|---|---|
| 0 | 3533 | 2 |
| 1 | 1270 | 5 |
| 2 | 3270 | 1 |
| 3 | 1323 | 2 |
| 4 | 1064 | 2 |
# Replace each zero-padded file id with the joined command string of the
# sequence file carrying the same seq_file_num.
df_train['Commands'] = df_train['Commands'].map(df_fullver.set_index('seq_file_num')['Commands'])
df_train.head()
| Commands | malware_types | |
|---|---|---|
| 0 | execve ioctl ioctl prctl time getpid time getp... | 2 |
| 1 | execve uname brk brk set_tls set_tid_address s... | 5 |
| 2 | execve brk brk set_tls ioctl ioctl access gete... | 1 |
| 3 | execve ioctl ioctl unlink time getpid getppid ... | 2 |
| 4 | execve ioctl ioctl time getpid time getpid soc... | 2 |
df_train.shape
(2500, 2)
# Read the header-less test CSV; its single column is turned into the
# zero-padded four-digit file id.
df_test = pd.read_csv("D:/UGM/DTETI FT UGM/!!MatKul!!/Semester 5/Pemrosesan Bahasa Alami/CDMC2019MiniTask/Test.csv", sep=',', header=None)
df_test.columns = ['Commands']
df_test['Commands'] = [f"{file_id:04d}" for file_id in df_test['Commands']]
df_test.head()
| Commands | |
|---|---|
| 0 | 0054 |
| 1 | 1460 |
| 2 | 1312 |
| 3 | 1230 |
| 4 | 3077 |
# Replace each zero-padded file id with the joined command string of the
# sequence file carrying the same seq_file_num.
df_test['Commands'] = df_test['Commands'].map(df_fullver.set_index('seq_file_num')['Commands'])
df_test.head()
| Commands | |
|---|---|
| 0 | execve mmap2 cacheflush cacheflush readlink ca... |
| 1 | execve brk brk set_tls ioctl ioctl rt_sigprocm... |
| 2 | execve ioctl ioctl time getpid time getpid soc... |
| 3 | execve ioctl ioctl time getpid time getpid soc... |
| 4 | execve brk brk set_tls ioctl ioctl gettimeofda... |
df_test.shape
(1667, 1)
df_train.head()
| Commands | malware_types | |
|---|---|---|
| 0 | execve ioctl ioctl prctl time getpid time getp... | 2 |
| 1 | execve uname brk brk set_tls set_tid_address s... | 5 |
| 2 | execve brk brk set_tls ioctl ioctl access gete... | 1 |
| 3 | execve ioctl ioctl unlink time getpid getppid ... | 2 |
| 4 | execve ioctl ioctl time getpid time getpid soc... | 2 |
df_train.malware_types.value_counts()
2 1359 1 925 5 172 4 24 3 20 Name: malware_types, dtype: int64
def word_cloud(df, pixwidth=720, pixheight=450, column="index", counts="count"):
    """Wrap the rows of ``df`` in a word-cloud style data specification.

    Args:
        df: DataFrame whose rows become the dataset records.
        pixwidth, pixheight, column, counts: Accepted for call
            compatibility; not used by the current implementation.

    Returns:
        ``{"data": [{"name": "dataset", "values": <list of row dicts>}]}``.
    """
    records = df.to_dict(orient="records")
    return {"data": [{"name": "dataset", "values": records}]}
def distplot(df):
    """Show a Plotly distribution plot of the ``length`` column of ``df``.

    Args:
        df: DataFrame providing a ``length`` column.

    Returns:
        The result of calling ``show()`` on the created figure.
    """
    # `ff` was never imported anywhere in the file, so the original call
    # raised NameError; bring the figure factory into scope locally.
    import plotly.figure_factory as ff

    return ff.create_distplot([df['length']], ['length'], bin_size=10).show()
from collections import defaultdict
def mk_wordcloud(df):
    """Count whitespace-separated tokens over ``df.Commands`` for a word cloud.

    Args:
        df: DataFrame with a ``Commands`` column of strings.

    Returns:
        The ``word_cloud`` specification built from a frame with one row per
        distinct token (columns ``index`` and ``count``).
    """
    token_counts = defaultdict(int)  # token -> occurrences, in first-seen order
    for command_string in df["Commands"].tolist():
        for token in command_string.split():
            token_counts[token] += 1
    counts_frame = pd.Series(token_counts).to_frame(name="count").reset_index()
    return word_cloud(counts_frame, pixheight=600, pixwidth=900)
mk_wordcloud(df_train[df_train.malware_types==1])
{'data': [{'name': 'dataset',
'values': [{'index': 'execve', 'count': 2004},
{'index': 'brk', 'count': 183573},
{'index': 'set_tls', 'count': 542},
{'index': 'ioctl', 'count': 12491},
{'index': 'access', 'count': 1252},
{'index': 'geteuid32', 'count': 152},
{'index': 'prctl', 'count': 1533},
{'index': 'gettimeofday', 'count': 104083},
{'index': 'getpid', 'count': 246553},
{'index': 'socket', 'count': 143771},
{'index': 'connect', 'count': 137451},
{'index': 'getsockname', 'count': 48340},
{'index': 'open', 'count': 407893},
{'index': 'read', 'count': 2622161},
{'index': 'close', 'count': 175717},
{'index': 'write', 'count': 4037},
{'index': 'clone', 'count': 1244},
{'index': 'wait4', 'count': 1398},
{'index': 'SIGCHLD', 'count': 1466},
{'index': 'exit_group', 'count': 1596},
{'index': 'EXIT', 'count': 4711},
{'index': 'getppid', 'count': 14445},
{'index': 'times', 'count': 1798},
{'index': 'fcntl64', 'count': 29197},
{'index': 'setsockopt', 'count': 10631},
{'index': 'sendto', 'count': 5396797},
{'index': 'recvfrom', 'count': 5320719},
{'index': '_newselect', 'count': 73771},
{'index': 'rt_sigaction', 'count': 41556},
{'index': 'getsockopt', 'count': 84011},
{'index': 'nanosleep', 'count': 26970},
{'index': 'fork', 'count': 5460},
{'index': 'exit', 'count': 3118},
{'index': 'fcntl', 'count': 171147},
{'index': 'time', 'count': 478603},
{'index': 'unlink', 'count': 130},
{'index': 'send', 'count': 29885},
{'index': 'kill', 'count': 1078},
{'index': 'chdir', 'count': 274},
{'index': 'rt_sigprocmask', 'count': 45749},
{'index': 'set_thread_area', 'count': 796},
{'index': 'set_tid_address', 'count': 1090},
{'index': 'fstat64', 'count': 3934},
{'index': 'mmap2', 'count': 2843},
{'index': 'mprotect', 'count': 1988},
{'index': 'getuid', 'count': 901},
{'index': 'uname', 'count': 779},
{'index': 'stat64', 'count': 6685},
{'index': 'writev', 'count': 445},
{'index': 'geteuid', 'count': 247},
{'index': 'bind', 'count': 2028},
{'index': 'listen', 'count': 1982},
{'index': 'SIGSEGV', 'count': 174},
{'index': 'KILL', 'count': 482},
{'index': 'setsid', 'count': 377},
{'index': 'cacheflush', 'count': 1069},
{'index': 'readlink', 'count': 82944},
{'index': 'munmap', 'count': 196},
{'index': 'getdents', 'count': 6573},
{'index': 'mmap', 'count': 591},
{'index': 'fstat', 'count': 6642},
{'index': 'getdents64', 'count': 16897},
{'index': 'pipe', 'count': 95},
{'index': 'recv', 'count': 17939},
{'index': 'clock_gettime', 'count': 1988},
{'index': 'setuid32', 'count': 12},
{'index': 'setresuid32', 'count': 12},
{'index': 'waitpid', 'count': 32},
{'index': 'setuid', 'count': 23},
{'index': 'setresuid', 'count': 23},
{'index': 'getuid32', 'count': 331},
{'index': 'SIGTRAP', 'count': 28},
{'index': 'sigreturn', 'count': 217},
{'index': 'gettid', 'count': 285},
{'index': 'epoll_create1', 'count': 17},
{'index': 'epoll_ctl', 'count': 68},
{'index': 'poll', 'count': 311},
{'index': 'epoll_pwait', 'count': 17},
{'index': 'lstat64', 'count': 46},
{'index': 'chmod', 'count': 15},
{'index': 'SIGPIPE', 'count': 168},
{'index': 'restart_syscall', 'count': 2},
{'index': 'getrlimit', 'count': 4},
{'index': 'setrlimit', 'count': 4},
{'index': 'vfork', 'count': 93},
{'index': 'getcwd', 'count': 105},
{'index': 'dup2', 'count': 240},
{'index': 'readv', 'count': 171},
{'index': '_llseek', 'count': 102},
{'index': 'sendfile64', 'count': 209},
{'index': 'rename', 'count': 35},
{'index': 'utimensat', 'count': 5},
{'index': 'stat', 'count': 21164},
{'index': 'flock', 'count': 25},
{'index': 'tgkill', 'count': 10},
{'index': 'SIGILL', 'count': 1},
{'index': 'tkill', 'count': 2},
{'index': 'shmget', 'count': 23},
{'index': 'shmat', 'count': 19},
{'index': 'shmdt', 'count': 19},
{'index': 'prlimit64', 'count': 4},
{'index': 'futex', 'count': 4},
{'index': 'ugetrlimit', 'count': 1},
{'index': 'umask', 'count': 1},
{'index': 'msgget', 'count': 2}]}]}
mk_wordcloud(df_train[df_train.malware_types==2])
{'data': [{'name': 'dataset',
'values': [{'index': 'execve', 'count': 1388},
{'index': 'ioctl', 'count': 20402},
{'index': 'prctl', 'count': 744},
{'index': 'time', 'count': 14489},
{'index': 'getpid', 'count': 2783},
{'index': 'fork', 'count': 7614},
{'index': 'wait4', 'count': 1315},
{'index': 'SIGCHLD', 'count': 1365},
{'index': 'exit', 'count': 7582},
{'index': 'EXIT', 'count': 8066},
{'index': 'chdir', 'count': 1331},
{'index': 'setuid', 'count': 178},
{'index': 'setresuid', 'count': 178},
{'index': 'rt_sigaction', 'count': 26615},
{'index': 'socket', 'count': 826678},
{'index': 'fcntl', 'count': 1460841},
{'index': 'connect', 'count': 803414},
{'index': '_newselect', 'count': 805396},
{'index': 'getsockopt', 'count': 799350},
{'index': 'rt_sigprocmask', 'count': 32934},
{'index': 'nanosleep', 'count': 23821},
{'index': 'close', 'count': 842832},
{'index': 'unlink', 'count': 196},
{'index': 'getppid', 'count': 316},
{'index': 'times', 'count': 264},
{'index': 'getsockname', 'count': 1069},
{'index': 'open', 'count': 19441},
{'index': 'read', 'count': 730717},
{'index': 'setsid', 'count': 957},
{'index': 'write', 'count': 18940},
{'index': 'brk', 'count': 1674},
{'index': 'set_tls', 'count': 126},
{'index': 'gettimeofday', 'count': 18127},
{'index': 'clone', 'count': 483},
{'index': 'exit_group', 'count': 484},
{'index': 'setuid32', 'count': 96},
{'index': 'setresuid32', 'count': 96},
{'index': 'fcntl64', 'count': 200196},
{'index': 'access', 'count': 18857},
{'index': 'mmap', 'count': 128},
{'index': 'cacheflush', 'count': 336},
{'index': 'mprotect', 'count': 162},
{'index': 'readlink', 'count': 158},
{'index': 'munmap', 'count': 58},
{'index': 'bind', 'count': 8},
{'index': 'listen', 'count': 8},
{'index': 'setpgid', 'count': 6},
{'index': 'umask', 'count': 3},
{'index': 'send', 'count': 4579},
{'index': 'poll', 'count': 441},
{'index': 'recv', 'count': 12913},
{'index': 'set_thread_area', 'count': 46},
{'index': 'waitpid', 'count': 35},
{'index': 'geteuid', 'count': 19},
{'index': 'setsockopt', 'count': 13734},
{'index': 'fstat', 'count': 457},
{'index': 'getdents', 'count': 765},
{'index': 'getrlimit', 'count': 19},
{'index': 'set_tid_address', 'count': 27},
{'index': 'fstat64', 'count': 29},
{'index': 'mmap2', 'count': 229},
{'index': 'getuid', 'count': 27},
{'index': 'uname', 'count': 23},
{'index': 'stat64', 'count': 82},
{'index': 'stat', 'count': 438},
{'index': 'setrlimit', 'count': 17},
{'index': 'sendto', 'count': 10045},
{'index': 'recvfrom', 'count': 9537},
{'index': 'SIGPIPE', 'count': 7},
{'index': 'SIGSEGV', 'count': 7},
{'index': 'KILL', 'count': 8},
{'index': 'chroot', 'count': 6},
{'index': 'geteuid32', 'count': 7},
{'index': 'kill', 'count': 2},
{'index': 'rename', 'count': 3},
{'index': 'getdents64', 'count': 864},
{'index': 'lstat64', 'count': 122},
{'index': 'rmdir', 'count': 27},
{'index': 'writev', 'count': 18},
{'index': 'sigreturn', 'count': 1},
{'index': 'gettid', 'count': 1},
{'index': 'getuid32', 'count': 3}]}]}
mk_wordcloud(df_train[df_train.malware_types==3])
{'data': [{'name': 'dataset',
'values': [{'index': 'execve', 'count': 300},
{'index': 'mmap2', 'count': 2292},
{'index': 'cacheflush', 'count': 126},
{'index': 'readlink', 'count': 2463},
{'index': 'mprotect', 'count': 1342},
{'index': 'brk', 'count': 632},
{'index': 'munmap', 'count': 20},
{'index': 'set_tls', 'count': 208},
{'index': 'ioctl', 'count': 182},
{'index': 'open', 'count': 4601},
{'index': 'gettimeofday', 'count': 43829},
{'index': 'getpid', 'count': 167},
{'index': 'getppid', 'count': 160},
{'index': 'times', 'count': 20},
{'index': 'read', 'count': 80110},
{'index': 'socket', 'count': 373},
{'index': 'setsockopt', 'count': 314},
{'index': 'connect', 'count': 194},
{'index': 'getsockname', 'count': 60},
{'index': 'close', 'count': 2743},
{'index': 'clone', 'count': 104},
{'index': 'exit_group', 'count': 153},
{'index': 'EXIT', 'count': 160},
{'index': 'setsid', 'count': 20},
{'index': 'rt_sigaction', 'count': 2074},
{'index': 'clock_gettime', 'count': 87298},
{'index': 'stat64', 'count': 1617},
{'index': 'uname', 'count': 152},
{'index': 'send', 'count': 134},
{'index': 'poll', 'count': 116},
{'index': 'recv', 'count': 116},
{'index': 'sendto', 'count': 94},
{'index': 'recvfrom', 'count': 21},
{'index': 'rmdir', 'count': 40},
{'index': 'unlink', 'count': 55},
{'index': 'fstat64', 'count': 1330},
{'index': 'fcntl', 'count': 424},
{'index': 'getdents', 'count': 433},
{'index': 'rt_sigprocmask', 'count': 413},
{'index': 'wait4', 'count': 231},
{'index': 'SIGCHLD', 'count': 140},
{'index': 'sigreturn', 'count': 91},
{'index': 'prctl', 'count': 160},
{'index': 'msgget', 'count': 20},
{'index': 'msgctl', 'count': 40},
{'index': 'mknod', 'count': 20},
{'index': 'bind', 'count': 20},
{'index': 'mkdir', 'count': 20},
{'index': '_newselect', 'count': 43644},
{'index': 'msgrcv', 'count': 43619},
{'index': 'set_tid_address', 'count': 280},
{'index': 'fcntl64', 'count': 1260},
{'index': 'getuid32', 'count': 91},
{'index': 'flock', 'count': 140},
{'index': 'getsockopt', 'count': 280},
{'index': 'writev', 'count': 100},
{'index': 'mmap', 'count': 28},
{'index': 'time', 'count': 14},
{'index': 'fork', 'count': 56},
{'index': 'exit', 'count': 7},
{'index': 'stat', 'count': 550},
{'index': 'set_thread_area', 'count': 98},
{'index': 'getuid', 'count': 49},
{'index': 'fstat', 'count': 98},
{'index': 'getdents64', 'count': 208}]}]}
mk_wordcloud(df_train[df_train.malware_types==4])
{'data': [{'name': 'dataset',
'values': [{'index': 'execve', 'count': 388},
{'index': 'ioctl', 'count': 1967},
{'index': 'open', 'count': 1797},
{'index': 'fork', 'count': 281},
{'index': 'exit', 'count': 29},
{'index': 'EXIT', 'count': 372},
{'index': 'time', 'count': 4186},
{'index': 'getpid', 'count': 257},
{'index': 'getppid', 'count': 407},
{'index': 'socket', 'count': 1995},
{'index': 'brk', 'count': 265},
{'index': 'read', 'count': 2546},
{'index': 'close', 'count': 3580},
{'index': 'connect', 'count': 3602},
{'index': 'send', 'count': 525},
{'index': 'poll', 'count': 527},
{'index': 'recv', 'count': 537},
{'index': 'rt_sigprocmask', 'count': 3631},
{'index': 'rt_sigaction', 'count': 4099},
{'index': 'nanosleep', 'count': 1880},
{'index': 'setsockopt', 'count': 19},
{'index': 'write', 'count': 109},
{'index': '_newselect', 'count': 12},
{'index': 'mmap', 'count': 47},
{'index': 'cacheflush', 'count': 66},
{'index': 'readlink', 'count': 47},
{'index': 'mprotect', 'count': 385},
{'index': 'munmap', 'count': 22},
{'index': 'getsockname', 'count': 3},
{'index': 'mmap2', 'count': 724},
{'index': 'set_tls', 'count': 13},
{'index': 'clone', 'count': 89},
{'index': 'exit_group', 'count': 343},
{'index': 'gettimeofday', 'count': 1825},
{'index': 'pipe', 'count': 66},
{'index': 'fcntl64', 'count': 717},
{'index': 'vfork', 'count': 3},
{'index': 'dup2', 'count': 263},
{'index': 'set_tid_address', 'count': 348},
{'index': 'fstat64', 'count': 384},
{'index': 'prctl', 'count': 348},
{'index': 'getuid32', 'count': 8},
{'index': 'uname', 'count': 156},
{'index': 'stat64', 'count': 1150},
{'index': 'wait4', 'count': 286},
{'index': 'SIGCHLD', 'count': 299},
{'index': 'sigreturn', 'count': 152},
{'index': 'gettid', 'count': 181},
{'index': 'writev', 'count': 126},
{'index': 'fcntl', 'count': 35},
{'index': 'stat', 'count': 314},
{'index': 'getrlimit', 'count': 4},
{'index': 'setrlimit', 'count': 4},
{'index': 'set_thread_area', 'count': 342},
{'index': 'lstat64', 'count': 65},
{'index': 'access', 'count': 35},
{'index': 'flock', 'count': 3},
{'index': 'bind', 'count': 5},
{'index': 'listen', 'count': 4},
{'index': 'getuid', 'count': 488},
{'index': 'waitpid', 'count': 70},
{'index': 'getcwd', 'count': 5},
{'index': '_llseek', 'count': 62},
{'index': 'geteuid', 'count': 60},
{'index': 'readv', 'count': 140},
{'index': 'chdir', 'count': 62},
{'index': 'sendfile64', 'count': 196},
{'index': 'rename', 'count': 20},
{'index': 'unlink', 'count': 20},
{'index': 'utimensat', 'count': 20},
{'index': 'pipe2', 'count': 2},
{'index': 'times', 'count': 1},
{'index': 'fstat', 'count': 28},
{'index': 'getdents64', 'count': 113},
{'index': 'mkdir', 'count': 1},
{'index': 'epoll_create1', 'count': 1},
{'index': 'epoll_ctl', 'count': 4},
{'index': 'clock_gettime', 'count': 8},
{'index': 'sendto', 'count': 2},
{'index': 'recvfrom', 'count': 2},
{'index': 'epoll_pwait', 'count': 1},
{'index': 'chmod', 'count': 27}]}]}
mk_wordcloud(df_train[df_train.malware_types==5])
{'data': [{'name': 'dataset',
'values': [{'index': 'execve', 'count': 2304},
{'index': 'uname', 'count': 1326},
{'index': 'brk', 'count': 1585},
{'index': 'set_tls', 'count': 2084},
{'index': 'set_tid_address', 'count': 2304},
{'index': 'set_robust_list', 'count': 568},
{'index': 'futex', 'count': 304},
{'index': 'rt_sigaction', 'count': 22665},
{'index': 'rt_sigprocmask', 'count': 21981},
{'index': 'ugetrlimit', 'count': 152},
{'index': 'readlink', 'count': 341},
{'index': 'getcwd', 'count': 172},
{'index': 'clone', 'count': 1480},
{'index': 'wait4', 'count': 984},
{'index': 'SIGCHLD', 'count': 1084},
{'index': 'SIGSEGV', 'count': 60},
{'index': 'KILL', 'count': 60},
{'index': 'open', 'count': 62469},
{'index': 'fcntl64', 'count': 2890},
{'index': 'fstat64', 'count': 10882},
{'index': 'read', 'count': 59008},
{'index': 'mmap2', 'count': 13556},
{'index': 'close', 'count': 95226},
{'index': 'mprotect', 'count': 4534},
{'index': 'prctl', 'count': 2132},
{'index': 'getuid32', 'count': 1932},
{'index': 'getpid', 'count': 1084},
{'index': 'getppid', 'count': 1084},
{'index': 'stat64', 'count': 7824},
{'index': 'clock_gettime', 'count': 516},
{'index': 'ioctl', 'count': 8315},
{'index': 'fchmod', 'count': 516},
{'index': 'fchown32', 'count': 456},
{'index': 'readv', 'count': 1032},
{'index': 'writev', 'count': 624},
{'index': 'rename', 'count': 516},
{'index': 'exit_group', 'count': 1196},
{'index': 'EXIT', 'count': 1196},
{'index': 'write', 'count': 463},
{'index': 'setsid', 'count': 112},
{'index': 'socket', 'count': 3959},
{'index': 'connect', 'count': 3846},
{'index': '_newselect', 'count': 3840},
{'index': 'getsockopt', 'count': 3714},
{'index': 'nanosleep', 'count': 10580},
{'index': 'lseek', 'count': 43543},
{'index': 'gettimeofday', 'count': 6406},
{'index': 'munmap', 'count': 8894},
{'index': 'send', 'count': 2060},
{'index': 'cacheflush', 'count': 259},
{'index': 'dup2', 'count': 360},
{'index': 'lstat64', 'count': 108},
{'index': 'unlink', 'count': 36},
{'index': 'set_thread_area', 'count': 220},
{'index': 'getrlimit', 'count': 20},
{'index': 'waitpid', 'count': 100},
{'index': 'getuid', 'count': 200},
{'index': 'fchown', 'count': 60}]}]}
pd.set_option('display.max_columns', 50)
# NOTE: this binds a second name to the same DataFrame, not a copy, so the
# column inserted below is visible through default_train as well.
default_train = df_train
# Length = number of whitespace-separated tokens in each command string.
df_train.insert(1, "Length", df_train.Commands.apply(lambda x: len(str(x).split())))
df_train.head()
| Commands | Length | malware_types | |
|---|---|---|---|
| 0 | execve ioctl ioctl prctl time getpid time getp... | 354 | 2 |
| 1 | execve uname brk brk set_tls set_tid_address s... | 427 | 5 |
| 2 | execve brk brk set_tls ioctl ioctl access gete... | 11381 | 1 |
| 3 | execve ioctl ioctl unlink time getpid getppid ... | 87034 | 2 |
| 4 | execve ioctl ioctl time getpid time getpid soc... | 455 | 2 |
# Correlate only the numeric columns: since pandas 2.0 DataFrame.corr() no
# longer silently drops the string Commands column and raises instead.
df_train[['Length', 'malware_types']].corr()
| Length | malware_types | |
|---|---|---|
| Length | 1.000000 | -0.245999 |
| malware_types | -0.245999 | 1.000000 |
# Split the training frame into a feature array (Commands, Length) and a
# label array.
x_train, y_train = df_train.drop(['malware_types'], axis=1).to_numpy(), df_train.malware_types.to_numpy()
# Same token-count feature for the test set.
df_test.insert(1, "Length", df_test.Commands.apply(lambda x: len(str(x).split())))
df_test.head()
| Commands | Length | |
|---|---|---|
| 0 | execve mmap2 cacheflush cacheflush readlink ca... | 558 |
| 1 | execve brk brk set_tls ioctl ioctl rt_sigprocm... | 791 |
| 2 | execve ioctl ioctl time getpid time getpid soc... | 483 |
| 3 | execve ioctl ioctl time getpid time getpid soc... | 23823 |
| 4 | execve brk brk set_tls ioctl ioctl gettimeofda... | 470 |
# Reassemble a training frame from the feature and label arrays; from here on
# default_train is a new object, no longer an alias of df_train.
default_train = pd.concat([pd.DataFrame(x_train), pd.Series(y_train)], axis=1, ignore_index=True)
default_train.columns=['Commands', 'Length', 'malware_types']
default_train.head()
| Commands | Length | malware_types | |
|---|---|---|---|
| 0 | execve ioctl ioctl prctl time getpid time getp... | 354 | 2 |
| 1 | execve uname brk brk set_tls set_tid_address s... | 427 | 5 |
| 2 | execve brk brk set_tls ioctl ioctl access gete... | 11381 | 1 |
| 3 | execve ioctl ioctl unlink time getpid getppid ... | 87034 | 2 |
| 4 | execve ioctl ioctl time getpid time getpid soc... | 455 | 2 |
pd.DataFrame(default_train.malware_types.value_counts())
| malware_types | |
|---|---|
| 2 | 1359 |
| 1 | 925 |
| 5 | 172 |
| 4 | 24 |
| 3 | 20 |
# We need a custom pre-processor to extract the correct field from each row,
# but also want scikit-learn's default preprocessing (e.g. lowercasing).
def get_features(df):
    """Vectorize the ``Commands`` and ``Length`` columns of ``df``.

    ``Length`` is encoded with a ``CountVectorizer`` over digit runs and
    ``Commands`` with a sublinear TF-IDF over 2- to 5-grams; both are
    combined in a ``FeatureUnion`` that is fitted on ``df``.

    Args:
        df: DataFrame with ``Commands`` and ``Length`` columns. It is not
            modified.

    Returns:
        Tuple ``(vectorizer, x_train_fit, x_train_trans)``: the fitted
        ``FeatureUnion`` and two independent dense DataFrames holding the
        same feature matrix, with the union's feature names as columns.
    """
    feature_cols = ['Commands', 'Length']
    # Stringify a copy of the two columns instead of writing the converted
    # values back into the caller's frame.
    rows = df[feature_cols].astype(str).values
    default_preprocessor = CountVectorizer().build_preprocessor()

    def build_preprocessor(field):
        # Index into the row arrays actually handed to the vectorizer (which
        # only hold feature_cols), not into df.columns, so the lookup stays
        # correct whatever the column order or extra columns of df.
        field_idx = feature_cols.index(field)
        return lambda x: default_preprocessor(x[field_idx])

    vectorizer = FeatureUnion([
        ('Length', CountVectorizer(
            token_pattern=r'\d+',  # raw string: '\d' is an invalid str escape
            preprocessor=build_preprocessor('Length'))),
        ('Commands', TfidfVectorizer(
            ngram_range=(2, 5), sublinear_tf=True,
            preprocessor=build_preprocessor('Commands'))),
    ])
    features = vectorizer.fit_transform(rows)
    # get_feature_names() was removed in scikit-learn 1.2; fall back to it
    # only on old releases that lack get_feature_names_out().
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()
    x_train_fit = pd.DataFrame(features.toarray(), columns=feature_names)
    # The original refitted the same vectorizer on the same data to obtain an
    # identical second matrix; a copy gives the same result without the
    # second fit.
    x_train_trans = x_train_fit.copy()
    return vectorizer, x_train_fit, x_train_trans
vectorizer_default, x_trainfit_default, x_train_trans_default = get_features(default_train)
x_train_trans_default.shape
(2500, 21852)
# NOTE(review): this fits a second, independent vectorizer on the test set, so
# its vocabulary and IDF weights are learned from test data rather than taken
# from the training vectorizer; the columns are only re-aligned by name
# further down. Consider transforming the test rows with vectorizer_default
# instead — confirm before changing, as it alters the feature matrix.
vectorizer_test, x_testfit, x_test_trans = get_features(df_test)
x_test_trans.shape
(1667, 18642)
t1 = x_train_trans_default
t2 = x_test_trans
solved :D
# Tag each matrix with its origin, then stack them so both parts share the
# union of the two feature-name sets as columns; a feature missing on one
# side becomes NaN there.
x_train_trans_default['label'] = 'train'
x_test_trans['label'] = 'test'
df_temp = pd.concat([x_train_trans_default,x_test_trans])
df_temp.shape
(4167, 26452)
df_temp.label.value_counts()
train 2500 test 1667 Name: label, dtype: int64
# Split the stacked frame back into its train and test parts using the tag.
x_train_trans_default = df_temp.loc[df_temp['label'] == 'train']
x_test_trans = df_temp.loc[df_temp['label'] == 'test']
x_train_trans_default.shape
(2500, 26452)
x_test_trans.shape
(1667, 26452)
# Remove the helper tag and zero-fill the NaN cells left by the concat.
x_train_trans_default = x_train_trans_default.drop(columns=['label']).fillna(0)
x_test_trans = x_test_trans.drop(columns=['label']).fillna(0)
x_train_trans_default.shape
(2500, 26451)
x_test_trans.shape
(1667, 26451)
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from sklearn.neighbors import KNeighborsClassifier
# Candidate classifiers, each constructed with its default hyperparameters.
models = [LogisticRegression(), LGBMClassifier(), KNeighborsClassifier()]
def cv_df_formation(x_val, y_val, estimators=None, cv=5):
    """Cross-validate each candidate model and tabulate per-fold accuracy.

    Args:
        x_val: Feature matrix passed to ``cross_val_score``.
        y_val: Target labels passed to ``cross_val_score``.
        estimators: Optional iterable of estimators to evaluate; defaults to
            the module-level ``models`` list.
        cv: Value forwarded as ``cross_val_score``'s ``cv`` argument;
            defaults to 5, the value previously hard-coded.

    Returns:
        DataFrame with columns ``model_name``, ``fold_idx`` and ``accuracy``,
        one row per (model, fold) pair.
    """
    if estimators is None:
        estimators = models
    # The original pre-allocated an empty DataFrame that was overwritten
    # before use and re-imported cross_val_score (already imported at file
    # level); both are dropped.
    entries = []
    for model in estimators:
        model_name = model.__class__.__name__
        accuracies = cross_val_score(model, x_val, y_val, scoring='accuracy', cv=cv)
        entries.extend(
            (model_name, fold_idx, accuracy)
            for fold_idx, accuracy in enumerate(accuracies)
        )
    return pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])
cv_df_default = cv_df_formation(x_train_trans_default, y_train)
cv_df_default
| model_name | fold_idx | accuracy | |
|---|---|---|---|
| 0 | LogisticRegression | 0 | 0.952 |
| 1 | LogisticRegression | 1 | 0.964 |
| 2 | LogisticRegression | 2 | 0.968 |
| 3 | LogisticRegression | 3 | 0.968 |
| 4 | LogisticRegression | 4 | 0.970 |
| 5 | LGBMClassifier | 0 | 0.982 |
| 6 | LGBMClassifier | 1 | 0.982 |
| 7 | LGBMClassifier | 2 | 0.980 |
| 8 | LGBMClassifier | 3 | 0.972 |
| 9 | LGBMClassifier | 4 | 0.984 |
| 10 | KNeighborsClassifier | 0 | 0.952 |
| 11 | KNeighborsClassifier | 1 | 0.960 |
| 12 | KNeighborsClassifier | 2 | 0.962 |
| 13 | KNeighborsClassifier | 3 | 0.968 |
| 14 | KNeighborsClassifier | 4 | 0.968 |
# Rebuild a feature matrix from df_train for the learning-curve experiment.
bias_train = df_train.drop("malware_types", axis=1)
bias_y_train = df_train.malware_types
bias_train.columns = ['Commands', 'Length']
vectorizer_bias, x_trainfit_bias, x_train_trans_bias = get_features(bias_train)
# Despite the name, these fractions are passed as learning_curve's
# train_sizes argument below.
test_perc_list = [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]
estimator=LGBMClassifier()
cv=5
n_jobs=-1
train_sizes, train_scores, test_scores = learning_curve(estimator,x_train_trans_bias, bias_y_train, cv=cv, n_jobs=n_jobs, train_sizes=test_perc_list)
def get_mean(arr):
    """Return the mean of every first-axis slice of ``arr`` as a list.

    Args:
        arr: Array-like exposing ``shape`` and integer indexing.

    Returns:
        List with ``arr[i].mean()`` for each ``i`` along the first axis.
    """
    return [arr[row].mean() for row in range(arr.shape[0])]
# Average the scores of each row of train_scores / test_scores.
trainFinal,testFinal = get_mean(train_scores),get_mean(test_scores)
import plotly.graph_objs as go
fig=go.Figure()
# The x axis uses the requested fractions (test_perc_list), not the values
# returned in train_sizes.
x,y,z = test_perc_list, trainFinal, testFinal
fig.add_trace(go.Scatter(x=x , y=y , name='Train Score', line_shape='linear'))
fig.add_trace(go.Scatter(x=x , y=z , name='Test Score', line_shape='linear'))
fig.update_layout(title_text = 'Bias Variance TradeOff')
fig.update_xaxes(title_text='Train Size', showgrid=False)
fig.update_yaxes(title_text='Accuracy', showgrid=False)
fig.show()
# Default-parameter LightGBM classifier fitted on the aligned training matrix.
model_selected = LGBMClassifier()
model_selected.fit(x_train_trans_default,y_train)
LGBMClassifier()
# 4-fold cross-validated predictions on the training set, summarised as a
# confusion matrix (rows: true class, columns: predicted class).
y_pred_lgbm = cross_val_predict(model_selected, x_train_trans_default, y_train, cv = 4)
conf_mx = confusion_matrix(y_train, y_pred_lgbm)
print(conf_mx)
plt.matshow(conf_mx, cmap=plt.cm.Blues)
[[ 894 29 0 2 0] [ 16 1341 0 2 0] [ 0 0 20 0 0] [ 3 0 0 21 0] [ 0 0 0 0 172]]
<matplotlib.image.AxesImage at 0x1fda0e894c0>
print(classification_report(y_train, y_pred_lgbm))
precision recall f1-score support
1 0.98 0.97 0.97 925
2 0.98 0.99 0.98 1359
3 1.00 1.00 1.00 20
4 0.84 0.88 0.86 24
5 1.00 1.00 1.00 172
accuracy 0.98 2500
macro avg 0.96 0.97 0.96 2500
weighted avg 0.98 0.98 0.98 2500
# LightGBM classifier with explicit hyperparameters.
model = LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.08583748564200731,
               max_depth=11, min_child_samples=30, min_child_weight=0.001,
               min_split_gain=0.0, n_estimators=919, n_jobs=-1, num_leaves=60,
               objective=None, random_state=42, reg_alpha=1.5543843815592084,
               reg_lambda=6.263494698476247, silent=True,
               subsample=0.6000000000000001, subsample_for_bin=200000,
               subsample_freq=16)
# Fit on the labelled training features. The original called
# model.fit(x_test_trans): no target was passed (TypeError) and the data was
# the test set, so this model was never actually trained.
model.fit(x_train_trans_default, y_train)
# Predict with the model fitted just above instead of the default-parameter
# model_selected, which left the configured model unused.
mlw_val_pred = model.predict(x_test_trans)
mlw_val_pred
array([2, 1, 2, ..., 2, 3, 2], dtype=int64)
# Re-read the raw test file numbers: df_test.Commands was overwritten with the
# command strings earlier on.
df_test_backup = pd.DataFrame(pd.read_csv("D:/UGM/DTETI FT UGM/!!MatKul!!/Semester 5/Pemrosesan Bahasa Alami/CDMC2019MiniTask/Test.csv", sep=',', header=None))
df_test_backup
| 0 | |
|---|---|
| 0 | 54 |
| 1 | 1460 |
| 2 | 1312 |
| 3 | 1230 |
| 4 | 3077 |
| ... | ... |
| 1662 | 3281 |
| 1663 | 824 |
| 1664 | 3380 |
| 1665 | 1311 |
| 1666 | 487 |
1667 rows × 1 columns
# Pair each test file number with its predicted class and write the result to
# mlw_detection_pred.csv (without the index column).
mlw_detection_pred = pd.DataFrame({
"file_num": df_test_backup[0],
"mlw_pred": mlw_val_pred.astype(int)
})
mlw_detection_pred.to_csv('mlw_detection_pred.csv', index=False)
mlw_detection_pred
| file_num | mlw_pred | |
|---|---|---|
| 0 | 54 | 2 |
| 1 | 1460 | 1 |
| 2 | 1312 | 2 |
| 3 | 1230 | 1 |
| 4 | 3077 | 2 |
| ... | ... | ... |
| 1662 | 3281 | 1 |
| 1663 | 824 | 2 |
| 1664 | 3380 | 2 |
| 1665 | 1311 | 3 |
| 1666 | 487 | 2 |
1667 rows × 2 columns